import pandas as pd
from sklearn.model_selection import train_test_split
# Load the labelled competition data and the unlabelled test data.
full_frame = pd.read_csv(r"D:\NLP\nlp_data\competition\datagrand\datagrand_2021_train.csv")
df_test = pd.read_csv(r"D:\NLP\nlp_data\competition\datagrand\datagrand_2021_test.csv")

# Split the row positions 70/30 with a fixed seed, then materialise each part
# as its own frame with a fresh 0..n-1 index.
row_positions = list(range(len(full_frame)))
train_ids, val_ids = train_test_split(
    row_positions,
    test_size=0.3,
    random_state=2021,
)
df_train = full_frame.loc[train_ids].reset_index(drop=True)
df_val = full_frame.loc[val_ids].reset_index(drop=True)
del full_frame, row_positions
# Build the token vocabulary from the 'text' column of the training split and
# the test set, then write it to vocab.txt (one token per line; the line number
# is the token id).
# NOTE(review): the validation split is not scanned, so tokens that occur only
# there will be absent from vocab.txt -- confirm this is intended.
special_tokens = ['，', '。', '！', '？']

charset = set()
for frame in (df_train, df_test):
    # dropna(): a missing text value is a float NaN and has no .split().
    for text in frame['text'].dropna():
        charset.update(text.split(" "))

# Leading, trailing or doubled spaces make split(" ") yield "", which would
# otherwise end up as a blank line in the vocabulary file.
charset.discard("")

# Sort so token ids are identical on every run (string-set iteration order
# changes between interpreter runs under hash randomization), and leave out
# the punctuation tokens already placed at the front so none is listed twice.
id2char = special_tokens + sorted(charset.difference(special_tokens))

with open('vocab.txt', 'w', encoding='utf-8') as fwriter:
    fwriter.writelines(token + "\n" for token in id2char)

# Write the validation split, then the training split, to CSV files in the
# current working directory using to_csv's default settings.
for split_frame, out_name in ((df_val, 'valid.csv'), (df_train, "train.csv")):
    split_frame.to_csv(out_name)